# topic 11 g
#  Load the functions we will need in this script
source( "../assess_normality.R")
source( "../pop_sd.R")
#
#  Here we are looking to answer questions about
#  probabilities for proportions.  So a typical
#  problem might be:
#
#  We know the true proportion of some characteristic
#  in a population is 65%.  That is, for the whole
#  population, 65% of all things in the population
#  have this characteristic.  Then, if we take a 
#  sample of size 88 from this population, what is
#  the probability that the proportion of the 
#  sample that has that characteristic is less
#  than 55%?

#  As long as we have the time, let us create a
#  large population that has 65% of the population
#  having the characteristic and 35% not having
#  it.
# 6500 ones (items with the characteristic) followed by 3500 twos (without)
big_pop <- rep(c(1, 2), times = c(6500, 3500))
#
#   I have had students who do not like having
#   the values in big_pop be in this nice order, i.e.,
#   6500 ones and then 3500 twos.  For those students
#   we can shuffle the items in big_pop.
# shuffle() is expected to come from ../shuffle.R (not shown here);
# presumably it returns the same values in a random order -- confirm there.
source("../shuffle.R")
big_pop <- shuffle( big_pop )
# look at the first 30 values of big_pop after the call to shuffle()
head( big_pop, 30)
#
#   Now, take 10,000 samples, each of size 88,
#   from that population and record the 
#   proportion of the sample that has the
#   characteristic in L1
# Draw 10,000 samples of size 88 from big_pop (sample() draws without
# replacement by default) and keep, for each sample, the fraction of
# its items that are equal to 1.
L1 <- vapply(
  seq_len(10000),
  function(draw_index) {
    picked <- sample(big_pop, 88)
    sum(picked == 1) / 88
  },
  numeric(1)
)
#  Now let us look at the distribution of the 
#  values in L1
#
summary(L1) # compare the mean and the median
boxplot(L1, horizontal = TRUE)
hist(L1)

assess_normality(L1)

pop_sd(L1)
mean(L1)
#   Set those two values beside what the mathematics
#   predicts: the mean should be  p,  which is 0.65,
#   and the standard deviation should be
#   sqrt( p*(1-p)/n), which here is
sqrt(0.65 * 0.35 / 88)
#
#  So what we see is that we can use the normal
#  distribution with mean=p and 
#  sd = sqrt(p*(1-p)/n) to answer questions
#  about the probability associated with
#  a known proportion.

#######################
## small diversion...why the drop in the 
## histogram for the interval 0.66 to 0.68?
## let us look at the possible outcomes
#  A sample of size 88 can hold 0, 1, ..., 88 items with the
#  characteristic, so the possible sample proportions are
#  0/88, 1/88, ..., 88/88 (0 is a possible outcome too).
outcomes <- (0:88) / 88
outcomes
#  then look at the number of possible
#  outcomes in the intervals (0.64, 0.66], (0.66, 0.68],
#  and (0.68, 0.70]
outcomes[outcomes > 0.64 & outcomes <= 0.66]
outcomes[outcomes > 0.66 & outcomes <= 0.68]
outcomes[outcomes > 0.68 & outcomes <= 0.70]
# this explains the strange low value here 
#  and at a few other places.

# we would not see this if we took samples
# of size 100
#
# let us do that....

#########################

# Repeat the experiment with samples of size 100: 10,000 draws from
# big_pop, each reduced to the fraction of its items that are equal to 1.
L1 <- vapply(
  seq_len(10000),
  function(draw_index) {
    picked <- sample(big_pop, 100)
    sum(picked == 1) / 100
  },
  numeric(1)
)
#  Now let us look at the distribution of the 
#  values in L1
#
summary(L1) # compare the mean and the median
boxplot(L1, horizontal = TRUE)
hist(L1)

assess_normality(L1)

pop_sd(L1)
mean(L1)
#   Set those two values beside what the mathematics
#   predicts: the mean should be  p,  which is 0.65,
#   and the standard deviation should be
#   sqrt( p*(1-p)/n), which here is
sqrt(0.65 * 0.35 / 100)


# Then go back to the original question.
#
#  We know the true proportion of some characteristic
#  in a population is 65%.  That is, for the whole
#  population, 65% of all things in the population
#  have this characteristic.  Then, if we take a 
#  sample of size 88 from this population, what is
#  the probability that the proportion of the 
#  sample that has that characteristic is less
#  than 55%?
 
# This is just the same as asking "For a
# normal distribution, 
# N( 0.65, sqrt(0.65*(1-0.65)/88)), what is
# P(X < 0.55)?
# But we know how to do that:
# lower tail: P(X < 0.55) for X ~ N(0.65, sqrt(0.65*0.35/88))
pnorm(q = 0.55, mean = 0.65, sd = sqrt(0.65 * 0.35 / 88))

# for a population with a characteristic that
# is known to be in 58% of the population, if
# we take a sample of size 37, what is the 
# probability that the sample will show a 
# proportion greater than 63%?
# upper tail: P(X > 0.63) for X ~ N(0.58, sqrt(0.58*(1-0.58)/37))
pnorm(
  q = 0.63,
  mean = 0.58,
  sd = sqrt(0.58 * (1 - 0.58) / 37),
  lower.tail = FALSE
)

# If we know that the proportion of people who
# will vote for candidate A in the next election
# is 53%, then in a sample of size 734 what is
# the probability that the proportion of voters
# for candidate A in the sample will be less than
# 49% or greater than 57%?
# both tails: P(X < 0.49) + P(X > 0.57) for X ~ N(0.53, sqrt(0.53*0.47/734))
pnorm(q = 0.49, mean = 0.53, sd = sqrt(0.53 * 0.47 / 734)) +
  pnorm(
    q = 0.57,
    mean = 0.53,
    sd = sqrt(0.53 * 0.47 / 734),
    lower.tail = FALSE
  )

############################################
############################################
#  Is this normal approximation always good?
############################################
##
##   Look at a new case, one where n*p<10
##
##   consider the case where  
##   p=0.15, what if our sample size was 12?
##   what would our experiment of 10,000
##   samples look like?
# 1500 ones (15% with the characteristic) followed by 8500 twos
big_pop <- rep(c(1, 2), times = c(1500, 8500))

# 10,000 samples of size 12 from big_pop, each reduced to the fraction
# of its items that are equal to 1.
L1 <- vapply(
  seq_len(10000),
  function(draw_index) {
    picked <- sample(big_pop, 12)
    sum(picked == 1) / 12
  },
  numeric(1)
)
#  Now let us look at the distribution of the 
#  values in L1
#
summary(L1) # compare the mean and the median
# two pictures of the simulated proportions
boxplot(L1, horizontal = TRUE)
hist(L1)

assess_normality(L1)

# spread and center of the simulated proportions
pop_sd(L1)
mean(L1)
#   compare those to the mathematically
#   predicted mean and standard deviation
#  the mean should be  p  which is 0.15
#  the standard deviation should be
#  sqrt( p*(1-p)/n)
# predicted standard deviation, sqrt( p*(1-p)/n), for p = 0.15 and n = 12
sqrt(0.15 * 0.85 / 12)
#
#  Compare the normal approximation to the simulated
#  sample proportions for getting P(X<0.05).
#  The normal-model probability is kept in a variable so that it can be
#  reused below instead of retyping its printed value.
normal_prob <- pnorm(0.05, mean = 0.15, sd = sqrt(0.15 * 0.85 / 12))
normal_prob
#  the simulated proportion found at that same cumulative probability
quantile(L1, normal_prob)
#  the fraction of the simulated proportions that really are below 0.05
mean(L1 < 0.05)
# So we have a rule: if n*p>=10 and 
# if n*(1-p)>=10 then we can use the 
# normal approximation for the probabilities.

######################################
## We seem to do the same thing each time we
## run into this problem, that is, we use the 
## population proportion as the mean and the
## expression sqrt( p*(1-p)/n) as the standard 
## deviation.  Could we put this into a 
## function?  Yes, look at pprop().
source("../pprop.R")
#
# remember that we did
pnorm(q = 0.55, mean = 0.65, sd = sqrt(0.65 * 0.35 / 88))
## now the pprop() call to try for that problem
pprop(0.55, 0.65, 88)
# the other earlier problem, again with pnorm() first
pnorm(
  q = 0.63,
  mean = 0.58,
  sd = sqrt(0.58 * (1 - 0.58) / 37),
  lower.tail = FALSE
)
# and then the pprop() call to try
pprop(0.63, 0.58, 37, lower.tail = FALSE)